***********************************************************************************
* Linking Occupational Characteristics 
* last modified 7/16/2025
*
* Note: clean National Employment Matrix
*
***********************************************************************************


clear
capture log close 
set more off

cd "/Users/"

global dofile			=	"./Code/NEW-OEWS"
global matrix_data		=   "./Data/National Employment Matrix"
global oews_data		= 	"./Data/OEWS/all_excel"
global onet_data		= 	"./OccLink/ONET/ONET data/merged"
global results 			=	"./Results"
global temp				= 	"./Temp"

*******************************************************************************
** Clean the national employment matrix data
*******************************************************************************

** clean 1998 matrix data
import excel "${matrix_data}/matrix1998-08/PBMT9808.xlsx", sheet("main") cellrange(A1:K82502) case(lower) firstrow clear
drop combined_code
gen year = 1998 

/*Of the 650 occupations, 521 are detail and 129 are summary occupations. 
There are 246 detail and 68 summary industries in these industry-
occupation matrices.  Total employment or the data contained within the 
'Total, all industries' (000001) industry is the sum of the following 
industries*/
keep if inlist(sic, "000001")
destring baseempsize pc_ind_base pc_occ_base proj pc_occ_proj pc_ind_proj, replace

save "${temp}/matrix1998.dta", replace


** clean 2000 matrix data
import excel "${matrix_data}/matrix2000-10/PBMT0010.xlsx", sheet("main") cellrange(A1:K89299) case(lower) firstrow clear
drop combined_code
gen year = 2000 

/*Note that the first 897 records of PBMT0010.TXT are for those
occupations included in the 'Total, all industries' industry (6-digit 
code 000001)*/
keep if inlist(sic, "000001")

save "${temp}/matrix2000.dta", replace

** clean 2002 matrix data
import excel "${matrix_data}/matrix2002-12/PBMT0212.xlsx", sheet("main") cellrange(A1:K116326) firstrow case(lower) clear
drop combined_code
gen year = 2002

/*Note that the first 980 records of PBMT0212.TXT are for those
occupations included in the 'Total, all industries' industry (6-byte 
code 000001). */
keep if inlist(naics, "000001")

save "${temp}/matrix2002", replace

** clean 2004 matrix data
import excel "${matrix_data}/matrix2004-14/pbmt0414.xlsx", sheet("main") cellrange(A1:J128840) firstrow case(lower) clear
gen year = 2004 
/*Of the 1022 occupations, 753 are detail and 269 are summary occupations. 
Of the 502 industries, there are 336 detail and 166 summary industries in these industry-
occupation matrices.  Total employment or the data contained within the 
'Total, all industries' (000001) industry is the sum of the following 4
summary industries:

  Self-employed workers, all jobs (000600)
  Unpaid family workers, all jobs (000700)  
  Wage and salary workers, second job in agricultural production, forestry,
    fishing, or private households (000510)
  Wage and salary workers, all industries except second job in agricultural
    production, forestry, fishing, or private households (000520)*/

keep if inlist(naics, "000001")
	
save "${temp}/matrix2004.dta", replace

** clean 2006 matrix data
import excel "${matrix_data}/matrix2006-16/pbmt0616.xlsx", sheet("main") cellrange(A1:J118903) firstrow case(lower) clear
gen year = 2006

/*Of the 1021 occupations, 753 are detail and 268 are summary occupations. 
Of the 475 industries, there are 311 detail and 164 
summary industries in these industry-occupation matrices.  Total
employment of the data contained within the 
'Total, all industries' (000001) industry is the sum of the following 4
summary industries:

  Self-employed workers, all jobs (000600)
  Unpaid family workers, all jobs (000700)  
  Wage and salary workers, second job in agricultural production, forestry,
    fishing, or private households (000510)
  Wage and salary workers, all industries except second job in agricultural
    production, forestry, fishing, or private households (000520)*/

keep if inlist(naics, "000001")

save "${temp}/matrix2006.dta", replace

** clean 2008 matrix data
import excel "${matrix_data}/matrix2008-18/PBMT0818.xlsx", sheet("main") cellrange(A1:J111485) firstrow case(lower) clear
gen year = 2008
keep if inlist(naics, "TOT001")

save "${temp}/matrix2008.dta", replace

** clean 2010 matrix data
import excel "${matrix_data}/matrix2010/Matrix.xlsx", sheet("main") cellrange(A1:J115474) firstrow case(lower) clear
gen year = 2010
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2010.dta", replace

** clean 2012 matrix data
import excel "${matrix_data}/matrix2012.xlsx", sheet("main") cellrange(A1:J118443) firstrow case(lower) clear
gen year = 2012
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2012.dta", replace

** clean 2014 matrix data
import excel "${matrix_data}/matrix2014.xlsx", sheet("main") cellrange(A1:L118319) firstrow case(lower) clear
drop occupationtitle industrytitle
gen year = 2014
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2014.dta", replace

** clean 2016 matrix data
import excel "${matrix_data}/matrix2016.xlsx", sheet("main") cellrange(A1:L119176) firstrow case(lower) clear
drop occupationtitle industrytitle
gen year = 2016
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2016.dta", replace

** clean 2018 matrix data
import excel "${matrix_data}/matrix2018.xlsx", sheet("main") cellrange(A1:L110576) firstrow case(lower) clear
drop occupationtitle industrytitle
gen year = 2018
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2018.dta", replace

** clean 2019 matrix data
import excel "${matrix_data}/matrix2019.xlsx", sheet("main") cellrange(A1:L108849) firstrow case(lower) clear
drop occupationtitle industrytitle
gen year = 2019
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2019.dta", replace

** clean 2020 matrix data
import excel "${matrix_data}/matrix2020.xlsx", sheet("main") cellrange(A1:L107532) firstrow case(lower) clear
drop occupationtitle industrytitle
gen year = 2020
keep if inlist(naics, "TE1000") /*pc_ind_base = 100*/

save "${temp}/matrix2020.dta", replace


** append all data 
use "${temp}/matrix1998.dta", clear
append using "${temp}/matrix2000" "${temp}/matrix2002" "${temp}/matrix2004" "${temp}/matrix2006" "${temp}/matrix2008" ///
			 "${temp}/matrix2010" "${temp}/matrix2012" "${temp}/matrix2014" "${temp}/matrix2016" ///
			 "${temp}/matrix2018" "${temp}/matrix2019" "${temp}/matrix2020" 
			 
save "${temp}/matrix98-20combined.dta", replace


** keep occupation information and remove occupation * industry estimates
use "${temp}/matrix98-20combined.dta", replace

/* replace empsize: employment size counted in thousands after 2014 */

replace baseempsize = baseempsize*1000 if year >= 2010
replace num_chng    = num_chng*1000    if year >= 2010 /*projected number change in the next 10 yrs*/
replace proj        = proj * 1000      if year >= 2010

keep soc-naics

save "${temp}/matrix98-20combined.dta", replace


*******************************************************************************
** Clean the job vacancy data
*******************************************************************************

** clean 2000 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2000linked_code") cellrange(A2:I879) case(lower) clear

keep B I

rename B soc

gen jobopening = I * 1000
gen year = 2000 

drop I

drop if soc == ""

save "${temp}/jobopening2000.dta", replace



** clean 2002 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2002") cellrange(A2:I980) case(lower) clear

keep B I

rename B soc

gen jobopening = I * 1000
gen year = 2002

drop I

drop if soc == ""

save "${temp}/jobopening2002.dta", replace



** clean 2004 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2004") cellrange(A2:I1023) case(lower) clear

keep A I

rename A soc

gen jobopening = I * 1000
gen year = 2004

drop I

drop if soc == ""

save "${temp}/jobopening2004.dta", replace


** clean 2006 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2006") cellrange(A2:I1022) case(lower) clear

* fix some coding errors 
replace A = "11-1000" if B == "Top executives"
replace A = "11-0000" if B == "Management occupations"
replace A = "11-1300" if B == "Management, business, and financial occupations"

keep A I

rename A soc

gen jobopening = I * 1000
gen year = 2006

drop I

drop if soc == ""

save "${temp}/jobopening2006.dta", replace


** clean 2008 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2008") cellrange(A5:H1021) case(lower) clear
keep B H

rename B soc

gen jobopening = H * 1000

gen year = 2008

drop H

drop if soc == ""

save "${temp}/jobopening2008.dta", replace


** clean 2010 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2010") cellrange(A3:H1011) case(lower) clear
keep B H

rename B soc

gen jobopening = H * 1000
gen year = 2010

drop H

drop if soc == ""

save "${temp}/jobopening2010.dta", replace


** clean 2012 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2012") cellrange(A3:H1093) case(lower) clear
keep B H

rename B soc

gen jobopening = H * 1000
gen year = 2012

drop H 

drop if soc == ""

save "${temp}/jobopening2012.dta", replace


** clean 2014 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2014") cellrange(A3:I1093) case(lower) clear
keep B I

rename B soc

gen jobopening = I * 1000
gen year = 2014

drop I

drop if soc == ""

save "${temp}/jobopening2014.dta", replace


** clean 2016 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2016") cellrange(A3:N1093) case(lower) clear
keep B N

rename B soc

gen jobopening = N * 1000
gen year = 2016

drop N

drop if soc == ""

save "${temp}/jobopening2016.dta", replace


** clean 2018 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2018") cellrange(A3:N1078) case(lower) clear
keep B N

rename B soc

gen jobopening = N * 1000
gen year = 2018

drop N

drop if soc == ""

save "${temp}/jobopening2018.dta", replace


** clean 2019 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2019") cellrange(A2:N1049) case(lower) clear
keep B N

rename B soc

gen jobopening = N * 1000
gen year = 2019

drop N

drop if soc == ""

save "${temp}/jobopening2019.dta", replace


** clean 2020 job opening data
import excel "${matrix_data}/occupation2000_2020.xlsx", sheet("2020") cellrange(A2:N1049) case(lower) clear
keep B N

rename B soc

gen jobopening = N * 1000
gen year = 2020

drop N

drop if soc == ""

save "${temp}/jobopening2020.dta", replace


** append all data 
use "${temp}/jobopening2000.dta", clear
append using "${temp}/jobopening2002" "${temp}/jobopening2004" "${temp}/jobopening2006" ///
			 "${temp}/jobopening2008" "${temp}/jobopening2010" "${temp}/jobopening2012" ///
			 "${temp}/jobopening2014" "${temp}/jobopening2016" "${temp}/jobopening2018" ///
			 "${temp}/jobopening2019" "${temp}/jobopening2020" 
			 
save "${temp}/jobopening00-20combined.dta", replace



